## Stephen Moon
## Statistics Thesis

library(pscl)
library(tidyverse)
# Fix the RNG seed so the train/test split below is reproducible.
set.seed(5555)
# Proportion of rows assigned to the training set.
train.prop <- 0.75

# NOTE(review): machine-specific absolute path; the script only runs where
# this directory exists. Kept as-is because later code may rely on the
# working directory.
setwd("/Users/smoon/Desktop/Thesis/Data/")
results <- read.csv("RacesWithDIME.csv", header = TRUE, as.is = TRUE)
results$year.x <- as.factor(results$year.x)
# Drop the 1980 rows. A logical mask is used instead of `-which(...)`:
# when no row matches, `-which(...)` is `-integer(0)`, which selects zero
# rows and would silently empty the whole data set. `%in%` never returns NA,
# so rows with a missing year are kept, exactly as before.
results <- results[!(results$year.x %in% "1980"), ]
N <- nrow(results)

# Train-test split: draw floor(train.prop * N) row indices without
# replacement for training; every remaining row goes to the test set.
# sample.int(N, k) yields the same draws as sample(1:N, k) for a given seed,
# but is also well-defined when N is 0 or 1.
n.train <- floor(train.prop * N)
train.idx <- sample.int(N, size = n.train, replace = FALSE)
train <- results[train.idx, ]
# setdiff() rather than `-train.idx`: a negative subscript built from an
# empty index vector selects no rows at all, which would leave the test set
# empty instead of holding every row.
test <- results[setdiff(seq_len(N), train.idx), ]

# Split the train and test sets by party, based on DemPresVs.
# `threshold` is the lower bound on DemPresVs for the D subsets; the R
# subsets use the mirrored upper bound, 1 - threshold. With threshold < 0.5
# the two ranges overlap, so rows with DemPresVs in
# [threshold, 1 - threshold] appear in both the D and the R subsets.
threshold <- 0.45

# which() drops rows whose DemPresVs is NA. Plain logical subsetting would
# instead insert an all-NA phantom row for each of them.
train.D <- train[which(train$DemPresVs >= threshold), ]
test.D <- test[which(test$DemPresVs >= threshold), ]

train.R <- train[which(train$DemPresVs <= 1 - threshold), ]
test.R <- test[which(test$DemPresVs <= 1 - threshold), ]

# Fit the full model for each party.
# Missingness model, D side: binomial GLM of `uncontested` on the full
# covariate set, estimated on the train.D subset.
Dmod <- glm(
  uncontested ~
    as.factor(year.x) +   # year entered as a categorical term
    prcntBA +
    prcntHS +
    prcntAsian +
    prcntWhiteAll +
    dwnom1 +
    partyControl +
    prcntForeignBorn +
    under10k +
    over35k +
    over50k +
    over75k +
    over100k +
    over150k +
    prcntNotEmploy +
    gini +
    prcntOld +
    lagPresDiff50 +
    medianIncome +
    lagUncontested +
    lagWinMargin,
  data = train.D,
  family = binomial
)
# Coefficient table and deviance summary.
summary(Dmod)
# Pseudo-R^2 measures for the fitted GLM (pR2 comes from pscl).
pR2(Dmod)

# Missingness model, R side: same specification as the D-side model, but
# estimated on train.R (training rows with DemPresVs <= 1 - threshold).
# This was previously fit on train.D, which made Rmod an exact duplicate of
# the D-side model.
Rmod <- glm(uncontested ~ as.factor(year.x) + prcntBA +
              prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
              partyControl + prcntForeignBorn + under10k + over35k +
              over50k + over75k + over100k + over150k + prcntNotEmploy +
              gini + prcntOld + lagPresDiff50 +
              medianIncome + lagUncontested +
              lagWinMargin, data = train.R, family = binomial)
# Coefficient table and deviance summary.
summary(Rmod)
# Pseudo-R^2 measures for the fitted GLM (pR2 comes from pscl).
pR2(Rmod)



